# Standard-library imports.
from datetime import datetime      # date and time handling

# Third-party imports used throughout this document.
import duckdb                      # in-memory SQL database
import matplotlib.pylab as plt     # plotting
import numpy as np                 # numerical operations
import pandas as pd                # pandas DataFrames
import polars as pl                # polars DataFrames
import seaborn as sns              # statistical visualization
from polars import col, when       # polars expression helpers
2 import data from csv
Code
# Load the mtcars data set from CSV into a polars DataFrame.
mtcars = pl.read_csv("./data/mtcars.csv")
3 import data from pandas
Code
# Load the same CSV with pandas, then inspect the resulting type.
mtcars_pd = pd.read_csv("./data/mtcars.csv")
type(mtcars_pd)
<class 'pandas.core.frame.DataFrame'>
Code
# Convert the pandas DataFrame into a polars DataFrame and check its type.
mtcars = pl.from_pandas(mtcars_pd)
type(mtcars)
<class 'polars.dataframe.frame.DataFrame'>
Code
# Keep three columns and the first five rows for the later examples.
small_mtcars = mtcars.select(["cyl", "mpg", "hp"]).head(5)
4 select column
4.1 get column names
Code
# List the column names of small_mtcars.
small_mtcars.columns
['cyl', 'mpg', 'hp']
4.2 select columns by name
Code
# Pick two columns by name.
small_mtcars.select(["cyl", "mpg"])
shape: (5, 2)
cyl
mpg
i64
f64
6
21.0
6
21.0
4
22.8
6
21.4
8
18.7
4.3 select columns by name match with ‘p’
Code
# Keep only the columns whose name contains the letter 'p'.
# NOTE: the loop variable is named `c` rather than `col`, so it does not
# shadow the `col` expression helper imported from polars at the top of
# the file (the original comprehension clobbered that name).
cols_with_p = [c for c in small_mtcars.columns if "p" in c]
small_mtcars.select(cols_with_p)
shape: (5, 2)
mpg
hp
f64
i64
21.0
110
21.0
110
22.8
93
21.4
110
18.7
175
4.4 select columns by index
4.4.1 select first and 3rd columns
Code
# Select the 1st and 3rd columns by positional index.
names = small_mtcars.columns
small_mtcars.select([names[0], names[2]])
shape: (5, 2)
cyl
hp
i64
i64
6
110
6
110
4
93
6
110
8
175
4.4.2 select first to 3rd columns
Code
# Select the first three columns via a slice of the column-name list.
small_mtcars.select(small_mtcars.columns[:3])
shape: (5, 3)
cyl
mpg
hp
i64
f64
i64
6
21.0
110
6
21.0
110
4
22.8
93
6
21.4
110
8
18.7
175
5 drop column
Code
# Remove the 'cyl' column (returns a new DataFrame).
small_mtcars.drop("cyl")
shape: (5, 2)
mpg
hp
f64
i64
21.0
110
21.0
110
22.8
93
21.4
110
18.7
175
6 Renaming column
Code
# Rename 'mpg' to 'new_name_mpg' (returns a new DataFrame).
small_mtcars.rename({"mpg": "new_name_mpg"})
shape: (5, 3)
cyl
new_name_mpg
hp
i64
f64
i64
6
21.0
110
6
21.0
110
4
22.8
93
6
21.4
110
8
18.7
175
7 Create column
7.1 Mutate
Code
# Derive three new columns on the first rows of mtcars:
#   gear2 - gear plus one
#   gear3 - 'long' when gear > 3, otherwise 'short'
#   qsec2 - bucket qsec into 'short' / 'Medium' / 'long'
result = mtcars.head().with_columns(
    [
        (col("gear") + 1).alias("gear2"),
        when(col("gear") > 3)
        .then(pl.lit("long"))
        .otherwise(pl.lit("short"))
        .alias("gear3"),
        when(col("qsec") <= 17)
        .then(pl.lit("short"))
        .when(col("qsec") <= 18)
        .then(pl.lit("Medium"))
        .otherwise(pl.lit("long"))
        .alias("qsec2"),
    ]
)
print(result)
7.2 Transmute: create a column and keep only that column
Code
# Build gear2 = gear + 1 and keep only that column (transmute).
mtcars.head().select([(col("gear") + 1).alias("gear2")])
shape: (5, 1)
gear2
i64
5
5
5
4
4
8 Filter rows
Code
# Keep only the rows whose gear equals 4.
mtcars.filter(col("gear") == 4)
shape: (12, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Datsun 710"
22.8
4
108.0
93
3.85
2.32
18.61
1
1
4
1
"Merc 240D"
24.4
4
146.7
62
3.69
3.19
20.0
1
0
4
2
"Merc 230"
22.8
4
140.8
95
3.92
3.15
22.9
1
0
4
2
…
…
…
…
…
…
…
…
…
…
…
…
"Fiat 128"
32.4
4
78.7
66
4.08
2.2
19.47
1
1
4
1
"Honda Civic"
30.4
4
75.7
52
4.93
1.615
18.52
1
1
4
2
"Toyota Corolla"
33.9
4
71.1
65
4.22
1.835
19.9
1
1
4
1
"Fiat X1-9"
27.3
4
79.0
66
4.08
1.935
18.9
1
1
4
1
"Volvo 142E"
21.4
4
121.0
109
4.11
2.78
18.6
1
1
4
2
8.1 Filters with AND conditions
Code
# Rows where cyl > 4 AND gear == 5.
mtcars.filter((col("cyl") > 4) & (col("gear") == 5))
shape: (3, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Ford Pantera L"
15.8
8
351.0
264
4.22
3.17
14.5
0
1
5
4
"Ferrari Dino"
19.7
6
145.0
175
3.62
2.77
15.5
0
1
5
6
"Maserati Bora"
15.0
8
301.0
335
3.54
3.57
14.6
0
1
5
8
8.2 Filters with OR conditions
Code
# Rows where cyl == 6 OR gear == 5.
mtcars.filter((col("cyl") == 6) | (col("gear") == 5))
shape: (11, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Hornet 4 Drive"
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
"Valiant"
18.1
6
225.0
105
2.76
3.46
20.22
1
0
3
1
"Merc 280"
19.2
6
167.6
123
3.92
3.44
18.3
1
0
4
4
…
…
…
…
…
…
…
…
…
…
…
…
"Porsche 914-2"
26.0
4
120.3
91
4.43
2.14
16.7
0
1
5
2
"Lotus Europa"
30.4
4
95.1
113
3.77
1.513
16.9
1
1
5
2
"Ford Pantera L"
15.8
8
351.0
264
4.22
3.17
14.5
0
1
5
4
"Ferrari Dino"
19.7
6
145.0
175
3.62
2.77
15.5
0
1
5
6
"Maserati Bora"
15.0
8
301.0
335
3.54
3.57
14.6
0
1
5
8
8.3 filter row with index
8.3.1 first 3
Code
# First 3 rows.
small_mtcars.head(3)
shape: (3, 3)
cyl
mpg
hp
i64
f64
i64
6
21.0
110
6
21.0
110
4
22.8
93
8.3.2 last 3
Code
# Last 3 rows.
small_mtcars.tail(3)
shape: (3, 3)
cyl
mpg
hp
i64
f64
i64
4
22.8
93
6
21.4
110
8
18.7
175
8.3.3 5th rows
Code
# The 5th row (index 4).
mtcars[4]
shape: (1, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Hornet Sportabout"
18.7
8
360.0
175
3.15
3.44
17.02
0
0
3
2
8.3.4 1st and 5th rows
Code
# The 1st and 5th rows (indices 0 and 4).
mtcars[[0, 4]]
shape: (2, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Hornet Sportabout"
18.7
8
360.0
175
3.15
3.44
17.02
0
0
3
2
8.3.5 get 1 to 4 rows
Code
# Rows 1 through 4 (indices 0-3).
mtcars[0:4]
shape: (4, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Datsun 710"
22.8
4
108.0
93
3.85
2.32
18.61
1
1
4
1
"Hornet 4 Drive"
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
8.3.6 get random 5 rows
Code
# Draw 5 random rows; the fixed seed makes the draw reproducible.
mtcars.sample(n=5, seed=42)
shape: (5, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Hornet Sportabout"
18.7
8
360.0
175
3.15
3.44
17.02
0
0
3
2
"Ferrari Dino"
19.7
6
145.0
175
3.62
2.77
15.5
0
1
5
6
"Merc 230"
22.8
4
140.8
95
3.92
3.15
22.9
1
0
4
2
"AMC Javelin"
15.2
8
304.0
150
3.15
3.435
17.3
0
0
3
2
"Fiat 128"
32.4
4
78.7
66
4.08
2.2
19.47
1
1
4
1
9 Append
9.1 append by row
Code
# Stack two row slices of mtcars on top of each other.
data1 = mtcars[0:4]    # rows 1-4
data2 = mtcars[10:11]  # row 11
pl.concat([data1, data2], how="vertical")
shape: (5, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Datsun 710"
22.8
4
108.0
93
3.85
2.32
18.61
1
1
4
1
"Hornet 4 Drive"
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
"Merc 280C"
17.8
6
167.6
123
3.92
3.44
18.9
1
0
4
4
9.2 append by column
Code
# Place two single-column frames side by side.
data1 = small_mtcars.select("mpg")
data2 = small_mtcars.select("cyl")
pl.concat([data1, data2], how="horizontal")
shape: (5, 2)
mpg
cyl
f64
i64
21.0
6
21.0
6
22.8
4
21.4
6
18.7
8
9.3 Dropping NA values
Code
# Drop every row containing at least one null value.
mtcars.drop_nulls()
shape: (32, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Datsun 710"
22.8
4
108.0
93
3.85
2.32
18.61
1
1
4
1
"Hornet 4 Drive"
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
"Hornet Sportabout"
18.7
8
360.0
175
3.15
3.44
17.02
0
0
3
2
…
…
…
…
…
…
…
…
…
…
…
…
"Lotus Europa"
30.4
4
95.1
113
3.77
1.513
16.9
1
1
5
2
"Ford Pantera L"
15.8
8
351.0
264
4.22
3.17
14.5
0
1
5
4
"Ferrari Dino"
19.7
6
145.0
175
3.62
2.77
15.5
0
1
5
6
"Maserati Bora"
15.0
8
301.0
335
3.54
3.57
14.6
0
1
5
8
"Volvo 142E"
21.4
4
121.0
109
4.11
2.78
18.6
1
1
4
2
9.4 To drop rows with NAs in specific columns:
Code
# Drop rows with nulls in 'mpg' or 'disp' only; other columns may be null.
mtcars.drop_nulls(subset=["mpg", "disp"])
shape: (32, 12)
Unnamed: 0
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
str
f64
i64
f64
i64
f64
f64
f64
i64
i64
i64
i64
"Mazda RX4"
21.0
6
160.0
110
3.9
2.62
16.46
0
1
4
4
"Mazda RX4 Wag"
21.0
6
160.0
110
3.9
2.875
17.02
0
1
4
4
"Datsun 710"
22.8
4
108.0
93
3.85
2.32
18.61
1
1
4
1
"Hornet 4 Drive"
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
"Hornet Sportabout"
18.7
8
360.0
175
3.15
3.44
17.02
0
0
3
2
…
…
…
…
…
…
…
…
…
…
…
…
"Lotus Europa"
30.4
4
95.1
113
3.77
1.513
16.9
1
1
5
2
"Ford Pantera L"
15.8
8
351.0
264
4.22
3.17
14.5
0
1
5
4
"Ferrari Dino"
19.7
6
145.0
175
3.62
2.77
15.5
0
1
5
6
"Maserati Bora"
15.0
8
301.0
335
3.54
3.57
14.6
0
1
5
8
"Volvo 142E"
21.4
4
121.0
109
4.11
2.78
18.6
1
1
4
2
9.5 keep NA values
10 group by
10.1 average,min,max,sum
Code
# Per-cylinder summary statistics of horsepower and displacement.
tbl_query = (
    mtcars
    .group_by("cyl")
    .agg(
        [
            pl.col("hp").mean().alias("avg_hp"),      # mean horsepower
            pl.col("hp").min().alias("min_hp"),       # minimum horsepower
            pl.col("hp").max().alias("max_hp"),       # maximum horsepower
            # fixed typo in the alias: 'totol_disp' -> 'total_disp'
            pl.col("disp").sum().alias("total_disp"),  # total displacement
        ]
    )
)
tbl_query
shape: (3, 5)
cyl
avg_hp
min_hp
max_hp
totol_disp
i64
f64
i64
i64
f64
6
122.285714
105
175
1283.2
8
209.214286
150
335
4943.4
4
82.636364
52
113
1156.5
10.2 count record and count distinct record
Code
# Count the rows in each cylinder group.
# pl.count() is deprecated in recent polars releases; pl.len() is the
# supported way to count rows inside a group_by context (same u32 result).
mtcars.group_by("cyl").agg(pl.len().alias("n"))
shape: (3, 2)
cyl
n
i64
u32
6
7
8
14
4
11
Code
# Count distinct horsepower values per cylinder group.
mtcars.group_by("cyl").agg(pl.col("hp").n_unique().alias("n"))
shape: (3, 2)
cyl
n
i64
u32
4
10
6
4
8
9
11 order rows
Code
# Order rows by horsepower, ascending.
small_mtcars.sort("hp")
shape: (5, 3)
cyl
mpg
hp
i64
f64
i64
4
22.8
93
6
21.0
110
6
21.0
110
6
21.4
110
8
18.7
175
11.1 Sort in descending order
Code
# Order rows by horsepower, descending.
small_mtcars.sort("hp", descending=True)
shape: (5, 3)
cyl
mpg
hp
i64
f64
i64
8
18.7
175
6
21.0
110
6
21.0
110
6
21.4
110
4
22.8
93
11.2 Arrange by multiple variables
Code
# Order by cyl ascending, then by mpg descending within each cyl.
small_mtcars.sort(by=["cyl", "mpg"], descending=[False, True])
shape: (5, 3)
cyl
mpg
hp
i64
f64
i64
4
22.8
93
6
21.4
110
6
21.0
110
6
21.0
110
8
18.7
175
12 join
Code
# Two small frames sharing an 'id' key, used by the join examples below.
lhs = pl.DataFrame({"id": [1, 2, 3], "val": ["lhs.1", "lhs.2", "lhs.3"]})
rhs = pl.DataFrame({"id": [1, 2, 4], "val": ["rhs.1", "rhs.2", "rhs.3"]})
Code
# Show the left-hand frame.
lhs
shape: (3, 2)
id
val
i64
str
1
"lhs.1"
2
"lhs.2"
3
"lhs.3"
Code
# Show the right-hand frame.
rhs
shape: (3, 2)
id
val
i64
str
1
"rhs.1"
2
"rhs.2"
4
"rhs.3"
12.1 inner_join
Code
# Inner join on 'id': keep only ids present in both frames.
result = lhs.join(rhs, on="id", how="inner")
result
shape: (2, 3)
id
val
val_right
i64
str
str
1
"lhs.1"
"rhs.1"
2
"lhs.2"
"rhs.2"
12.2 full join
Code
# Full (outer) join on 'id': keep every id from both frames.
# how="outer" was renamed to how="full" in recent polars releases;
# the unmatched side is filled with nulls, exactly as before.
result = lhs.join(rhs, on="id", how="full")
result
shape: (4, 4)
id
val
id_right
val_right
i64
str
i64
str
1
"lhs.1"
1
"rhs.1"
2
"lhs.2"
2
"rhs.2"
null
null
4
"rhs.3"
3
"lhs.3"
null
null
12.3 left join
Code
# Left join on 'id': keep every lhs row; unmatched rhs values become null.
result = lhs.join(rhs, on="id", how="left")
result
shape: (3, 3)
id
val
val_right
i64
str
str
1
"lhs.1"
"rhs.1"
2
"lhs.2"
"rhs.2"
3
"lhs.3"
null
12.4 anti join
keep rows in the left table that have no match in the right table
Code
# Anti-join: lhs rows whose id has no match in rhs.
result = lhs.join(rhs, on="id", how="anti")
result
shape: (1, 2)
id
val
i64
str
3
"lhs.3"
keep rows in the right table that have no match in the left table
Code
# Anti-join the other way: rhs rows whose id has no match in lhs.
result = rhs.join(lhs, on="id", how="anti")
result
shape: (1, 2)
id
val
i64
str
4
"rhs.3"
13 Reshape tables
Code
# A small wide-format frame: one price column per product type.
costs = pl.DataFrame(
    {
        "id": [1, 2],
        "price_x": [0.1, 0.2],
        "price_y": [0.4, 0.5],
        "price_z": [0.7, 0.8],
    }
)
costs
shape: (2, 4)
id
price_x
price_y
price_z
i64
f64
f64
f64
1
0.1
0.4
0.7
2
0.2
0.5
0.8
13.1 Gather data long(wide to long)
The 3 methods below will give the same result
Code
# Wide to long: one (id, type, price) row per original price cell.
costs_long = costs.unpivot(
    index="id",                                # identifier column(s)
    on=["price_x", "price_y", "price_z"],      # columns to melt
    variable_name="type",                      # holds the old column name
    value_name="price",                        # holds the old cell value
)
costs_long
shape: (6, 3)
id
type
price
i64
str
f64
1
"price_x"
0.1
2
"price_x"
0.2
1
"price_y"
0.4
2
"price_y"
0.5
1
"price_z"
0.7
2
"price_z"
0.8
13.2 Spread data wide(long to wide)
Code
# Long back to wide: one column per distinct 'type' value.
costs_wide = costs_long.pivot(values="price", index="id", on="type")
costs_wide
shape: (2, 4)
id
price_x
price_y
price_z
i64
f64
f64
f64
1
0.1
0.4
0.7
2
0.2
0.5
0.8
14 string
Code
# Sample frame for the string-manipulation examples.
df = pl.DataFrame(
    {
        "text": ["abc", "DDD", "1243c", "aeEe"],
        "num": [3, 4, 7, 8],
    }
)
df
shape: (4, 2)
text
num
str
i64
"abc"
3
"DDD"
4
"1243c"
7
"aeEe"
8
14.1 upper case
Code
# Add 'text_new' holding the uppercase form of 'text'.
df.with_columns(pl.col("text").str.to_uppercase().alias("text_new"))
shape: (4, 3)
text
num
text_new
str
i64
str
"abc"
3
"ABC"
"DDD"
4
"DDD"
"1243c"
7
"1243C"
"aeEe"
8
"AEEE"
14.2 lower case
Code
# Add 'text_new' holding the lowercase form of 'text'.
df.with_columns(pl.col("text").str.to_lowercase().alias("text_new"))
shape: (4, 3)
text
num
text_new
str
i64
str
"abc"
3
"abc"
"DDD"
4
"ddd"
"1243c"
7
"1243c"
"aeEe"
8
"aeee"
14.3 match
Code
# Four 'T'/'F' flag columns built from different string matches on 'text'.
df.with_columns(
    [
        # exact equality with "abc"
        pl.when(pl.col("text") == "abc")
        .then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new1"),
        # starts with "a"
        pl.when(pl.col("text").str.starts_with("a"))
        .then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new2"),
        # ends with "c"
        pl.when(pl.col("text").str.ends_with("c"))
        .then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new3"),
        # contains "4"
        pl.when(pl.col("text").str.contains("4"))
        .then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new4"),
    ]
)
shape: (4, 6)
text
num
text_new1
text_new2
text_new3
text_new4
str
i64
str
str
str
str
"abc"
3
"T"
"T"
"T"
"F"
"DDD"
4
"F"
"F"
"F"
"F"
"1243c"
7
"F"
"F"
"T"
"T"
"aeEe"
8
"F"
"T"
"F"
"F"
14.4 concatenation
Code
# Concatenate 'text' with itself, separated by " is ".
df.with_columns((pl.col("text") + " is " + pl.col("text")).alias("text_new1"))
shape: (4, 3)
text
num
text_new1
str
i64
str
"abc"
3
"abc is abc"
"DDD"
4
"DDD is DDD"
"1243c"
7
"1243c is 1243c"
"aeEe"
8
"aeEe is aeEe"
14.5 replace
Use .str.replace(…, regex=True) with regular expressions to replace patterns in strings.
For example, the code below uses “a.”, where . is called a wildcard–which matches any character.
Code
# Replace the first regex match of "a." (an 'a' plus any one character)
# with "XX"; rows without a match are left unchanged.
df.with_columns(pl.col("text").str.replace(r"a.", "XX").alias("text_new1"))
shape: (4, 3)
text
num
text_new1
str
i64
str
"abc"
3
"XXc"
"DDD"
4
"DDD"
"1243c"
7
"1243c"
"aeEe"
8
"XXEe"
14.6 extract
Use str.extract() with a regular expression to pull out a matching piece of text.
text_new1 gets rid of the first ‘a’
text_new2 gets rid of the last ‘c’
Code
# Pull out capture group 1 of each regex; rows with no match become null.
df.with_columns(
    [
        # everything after the first 'a'
        pl.col("text").str.extract(r"a(.*)", 1).alias("text_new1"),
        # everything before the last 'c'
        pl.col("text").str.extract(r"(.*)c", 1).alias("text_new2"),
    ]
)
shape: (4, 4)
text
num
text_new1
text_new2
str
i64
str
str
"abc"
3
"bc"
"ab"
"DDD"
4
null
null
"1243c"
7
null
"1243"
"aeEe"
8
"eEe"
null
15 date
Polars provides robust functionalities for handling date and time data, including parsing, extracting components, and formatting.
Code
# String-typed dates and timestamps for the date-parsing examples.
df_dates = pl.DataFrame(
    {
        "dates": ["2023-04-05", "2024-05-06"],
        "times": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"],
    }
)
df_dates
shape: (2, 2)
dates
times
str
str
"2023-04-05"
"2023-04-05 06:07:08"
"2024-05-06"
"2024-05-06 07:08:09"
15.1 Extracting Date Components
You can extract various components like year, month, day, hour, minute, second from datetime objects.
# Parse 'dates' as Date values, then pull out year / month / day components.
df_dates.with_columns(
    [
        pl.col("dates").str.to_date().dt.year().alias("year"),
        pl.col("dates").str.to_date().dt.month().alias("month"),
        pl.col("dates").str.to_date().dt.day().alias("day"),
    ]
)
15.2 Formatting Dates
Dates can be formatted into different string representations using strftime().
# Parse 'times' as Datetime values and render them back as strings
# in the "YYYY-MM-DD HH:MM:SS" format.
df_dates.with_columns(
    pl.col("times")
    .str.to_datetime()
    .dt.strftime("%Y-%m-%d %H:%M:%S")
    .alias("formatted_time")
)
Code
# Add properly typed columns: 'real_times' (Datetime) and 'real_dates' (Date).
df_dates2 = df_dates.with_columns(
    pl.col("times").str.to_datetime().alias("real_times"),
    pl.col("dates").str.to_date().alias("real_dates"),
)
df_dates2
shape: (2, 4)
dates
times
real_times
real_dates
str
str
datetime[μs]
date
"2023-04-05"
"2023-04-05 06:07:08"
2023-04-05 06:07:08
2023-04-05
"2024-05-06"
"2024-05-06 07:08:09"
2024-05-06 07:08:09
2024-05-06
Code
# Summary statistics (count, nulls, mean, min/max, quantiles) per column.
df_dates2.describe()
shape: (9, 5)
statistic
dates
times
real_times
real_dates
str
str
str
str
str
"count"
"2"
"2"
"2"
"2"
"null_count"
"0"
"0"
"0"
"0"
"mean"
null
null
"2023-10-20 18:37:38.500000"
"2023-10-20 12:00:00"
"std"
null
null
null
null
"min"
"2023-04-05"
"2023-04-05 06:07:08"
"2023-04-05 06:07:08"
"2023-04-05"
"25%"
null
null
"2023-04-05 06:07:08"
"2023-04-05"
"50%"
null
null
"2024-05-06 07:08:09"
"2024-05-06"
"75%"
null
null
"2024-05-06 07:08:09"
"2024-05-06"
"max"
"2024-05-06"
"2024-05-06 07:08:09"
"2024-05-06 07:08:09"
"2024-05-06"
16 using polars with database
16.1 set up a DuckDB database, with an mtcars table.
Code
# Connect to an in-memory DuckDB database (the default).
con = duckdb.connect()

# A small sample of the mtcars data, one tuple per car (row).
mtcars_data = [
    (21.0, 6, 160, 110, 3.90, 2.62, 16.46, 0, 1, 4, 4),
    (21.0, 6, 160, 110, 3.90, 2.875, 17.02, 0, 1, 4, 4),
    (22.8, 4, 108, 93, 3.85, 2.320, 18.61, 1, 1, 4, 1),
    (21.4, 6, 258, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1),
]

# Column names for the DataFrame.
columns = [
    "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec",
    "vs", "am", "gear", "carb",
]

# orient="row" makes the row-wise layout of the tuples explicit; without
# it polars has to guess the orientation of a list of tuples and newer
# versions emit a warning.
df = pl.DataFrame(mtcars_data, schema=columns, orient="row")
16.2 create table in duckdb
Code
# Expose the polars DataFrame to DuckDB as a virtual table named 'mtcars'.
con.register("mtcars", df)
<duckdb.duckdb.DuckDBPyConnection object at 0x12a871470>
16.3 show tables in duckdb
Code
# List every table known to the DuckDB connection.
con.execute("SHOW TABLES").fetchall()
[('mtcars',)]
16.4 create query
Code
# SQL query: three columns from mtcars, restricted to 6-cylinder cars.
query = "SELECT mpg, cyl, hp FROM mtcars WHERE cyl = 6"
16.5 get data
Code
# Run the query and materialize the result as a polars DataFrame.
result = con.execute(query).pl()
result
shape: (3, 3)
mpg
cyl
hp
f64
i64
i64
21.0
6
110
21.0
6
110
21.4
6
110
16.6 LazyFrame
Code
# Run the query, then wrap the result in a polars LazyFrame.
lazy_df = con.execute(query).pl().lazy()
lazy_df
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)
DF ["mpg", "cyl", "hp"]; PROJECT */3 COLUMNS
16.7 Collect to DataFrame
Because of lazy evaluation, the collect function is what actually runs the SQL.
Code
# Execute the lazy plan and materialize it as an eager DataFrame.
result_df = lazy_df.collect()
result_df
---title: "Data manipulation with polars"author: "Tony Duan"execute: warning: false error: falseformat: html: toc: true toc-location: right code-fold: show code-tools: true number-sections: true code-block-bg: true code-block-border-left: "#31BAE9"---Polars is a DataFrame interface on top of an OLAP Query Engine implemented in Rust using [Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) as the memory model.- Lazy \| eager execution- Multi-threaded- SIMD- Query optimization- Powerful expression API- Hybrid Streaming (larger-than-RAM datasets)- Rust \| Python \| NodeJS \| R \| ...```{r}#Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")library(reticulate)#use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")py_require(c("polars", "pandas", "numpy", "matplotlib", "duckdb","datetime","seaborn","pyarrow"))```## load package```{python}# Import the pandas library for data manipulationimport pandas as pd# Import numpy for numerical operationsimport numpy as np# Import matplotlib.pylab for plottingimport matplotlib.pylab as plt# Import seaborn for statistical data visualizationimport seaborn as sns# Import polars for DataFrame operationsimport polars as pl# Import col and when from polars for expression buildingfrom polars import col, when# Import datetime for date and time operationsfrom datetime import datetime# Import duckdb for in-memory SQL database operationsimport duckdb```## import data from csv ```{python}# Read the CSV file './data/mtcars.csv' into a Polars DataFramemtcars = pl.read_csv("./data/mtcars.csv")```## import data from pandas```{python}# Read the CSV file into a pandas DataFramemtcars_pd= pd.read_csv("./data/mtcars.csv")# Print the type of mtcars_pdtype(mtcars_pd)# Convert the pandas DataFrame to a Polars DataFramemtcars = pl.from_pandas(mtcars_pd)# Print the type of mtcarstype(mtcars)``````{python}# Select the 'cyl', 'mpg', and 'hp' columns from 
the mtcars DataFrame and take the first 5 rowssmall_mtcars = mtcars.select(["cyl", "mpg","hp"]).head(5)```## select column### get column names```{python}# Get the column names of the small_mtcars DataFramesmall_mtcars.columns```### select columns by name```{python}# Select the 'cyl' and 'mpg' columns from the small_mtcars DataFramesmall_mtcars.select(["cyl", "mpg"])```### select columns by name match with 'p'```{python}# Select columns with 'p' in the namecols_with_p = [col for col in small_mtcars.columns if'p'in col]# Select the columns identified in cols_with_p from the small_mtcars DataFramesmall_mtcars.select(cols_with_p)```### select columns by index#### select first and 3rd columns```{python}# Select the first (index 0) and third (index 2) columns of the small_mtcars DataFramesmall_mtcars.select([small_mtcars.columns[0], small_mtcars.columns[2]])```#### select first to 3rd columns```{python}# Select columns from index 0 up to (but not including) index 3small_mtcars.select(small_mtcars.columns[0:3])```## drop column```{python}# Drop the column named "cyl" from the small_mtcars DataFramesmall_mtcars.drop("cyl")```## Renaming column```{python}# Rename the column 'mpg' to 'new_name_mpg'small_mtcars.rename({"mpg": "new_name_mpg"})```## Create column### Mutate```{python}# Apply transformations to the mtcars DataFrameresult = ( mtcars.head() .with_columns([# Create a new column 'gear2' by adding 1 to the 'gear' column (col("gear") +1).alias("gear2")# Create a new column 'gear3': if 'gear' > 3 then 'long', else 'short' ,when(col("gear") >3).then(pl.lit("long")) .otherwise(pl.lit("short")).alias("gear3")# Create a new column 'qsec2' with conditional logic based on 'qsec' values ,when(col("qsec") <=17).then(pl.lit("short")) .when(col("qsec") <=18).then(pl.lit("Medium")) .otherwise(pl.lit("long")).alias("qsec2") ]))# Print the resulting DataFrameprint(result)```### Transmute,create column and only keep this column```{python}# Select the 'gear' column from the first few 
rows of mtcars, add 1 to it, and alias it as 'gear2'mtcars.head().select([ (col("gear") +1).alias("gear2") ])```## Filter rows```{python}# Filter rows where the 'gear' column is equal to 4mtcars.filter( (col("gear") ==4))```### Filters with AND conditions```{python}# Filter rows where 'cyl' is greater than 4 AND 'gear' is equal to 5mtcars.filter( (col("cyl") >4) & (col("gear") ==5))```### Filters with OR conditions```{python}# Filter rows where 'cyl' is equal to 6 OR 'gear' is equal to 5mtcars.filter( (col("cyl") ==6) | (col("gear") ==5))```### filter row with index#### first 3```{python}# Select the first 3 rows of the small_mtcars DataFramesmall_mtcars.head(3)```#### last 3```{python}# Select the last 3 rows of the small_mtcars DataFramesmall_mtcars.tail(3)```#### 5th rows```{python}# Select the row at index 4 (which is the 5th row)mtcars[4]```#### 1 and 5th rows```{python}# Select rows at index 0 (1st row) and 4 (5th row)mtcars[[0, 4]]```#### get 1 to 4 rows```{python}# Select rows from index 0 up to (but not including) index 4mtcars[0:4]```#### get ramdon 5 rows```{python}# Select 5 random rows from the mtcars DataFrame, with a fixed seed for reproducibilitymtcars.sample(n=5, seed=42)```## Append### append by row```{python}# Select rows from index 0 up to (but not including) index 4data1 = mtcars[0:4]# Select rows from index 10 up to (but not including) index 11data2 = mtcars[10:11]# Concatenate data1 and data2 DataFrames verticallypl.concat([data1, data2], how="vertical")```### append by column```{python}# Select the 'mpg' column from small_mtcarsdata1 = small_mtcars.select("mpg")# Select the 'cyl' column from small_mtcarsdata2 = small_mtcars.select("cyl")# Concatenate data1 and data2 DataFrames horizontallypl.concat([data1, data2], how="horizontal")```### Dropping NA values```{python}# Drop all rows that contain any null valuesmtcars.drop_nulls()```### To drop rows with NAs in specific columns:```{python}# Drop rows that have null values in either the "mpg" 
or "disp" columnsmtcars.drop_nulls(subset=["mpg", "disp"])```### keep NA values## group by### average,min,max,sum```{python}# Group the mtcars DataFrame by the 'cyl' column and aggregate various statisticstbl_query = ( mtcars .group_by("cyl") .agg([ pl.col("hp").mean().alias("avg_hp"), # Calculate the mean of 'hp' and alias it as 'avg_hp' pl.col("hp").min().alias("min_hp"), # Calculate the minimum of 'hp' and alias it as 'min_hp' pl.col("hp").max().alias("max_hp"), # Calculate the maximum of 'hp' and alias it as 'max_hp' pl.col("disp").sum().alias("totol_disp") # Calculate the sum of 'disp' and alias it as 'totol_disp' ]))# Print the resulting aggregated DataFrametbl_query```### count record and count distinct record```{python}# Group the mtcars DataFrame by the 'cyl' column and count the number of rows in each groupmtcars.group_by("cyl").agg(pl.count().alias("n"))``````{python}# Group the mtcars DataFrame by the 'cyl' column and count the number of unique 'hp' values for each groupmtcars.group_by("cyl").agg(pl.col("hp").n_unique().alias("n"))```## order rows```{python}# Sort the small_mtcars DataFrame by the 'hp' column in ascending ordersmall_mtcars.sort("hp")```### Sort in descending order```{python}# Sort the small_mtcars DataFrame by the 'hp' column in descending ordersmall_mtcars.sort("hp", descending=True)```### Arrange by multiple variables```{python}# Sort the small_mtcars DataFrame by 'cyl' in ascending order and 'mpg' in descending ordersmall_mtcars.sort( by=["cyl", "mpg"], descending=[False, True] # True means descending)```## join```{python}# Create a Polars DataFrame named lhslhs = pl.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})# Create a Polars DataFrame named rhsrhs = pl.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})``````{python}# Display the lhs DataFramelhs``````{python}# Display the rhs DataFramerhs```### inner_join```{python}# Perform an inner join of lhs and rhs DataFrames on the 'id' columnresult = 
lhs.join(rhs, on="id", how="inner")# Display the resultresult```### full join```{python}# Perform a full outer join of lhs and rhs DataFrames on the 'id' columnresult = lhs.join(rhs, on="id", how="outer")# Display the resultresult```### left join```{python}# Perform a left join of lhs and rhs DataFrames on the 'id' columnresult = lhs.join(rhs, on="id", how="left")# Display the resultresult```### anti joinkeep data in left which not in right```{python}# Perform an anti-join: keep rows from lhs that do not have a match in rhs based on 'id'result = lhs.join(rhs, on="id", how="anti")# Display the resultresult```keep data in right which not in left```{python}# Perform an anti-join: keep rows from rhs that do not have a match in lhs based on 'id'result = rhs.join(lhs, on="id", how="anti")# Display the resultresult```## Reshape tables```{python}# Create a Polars DataFrame named costscosts = pl.DataFrame({"id": [1, 2],"price_x": [0.1, 0.2],"price_y": [0.4, 0.5],"price_z": [0.7, 0.8],})# Display the DataFramecosts```### Gather data long(wide to long)Below 3 method will give same result```{python}# Unpivot (melt) the costs DataFrame from wide to long formatcosts_long = costs.unpivot( index="id", # Columns to keep as identifiers on=["price_x", "price_y", "price_z"], # Columns to melt variable_name="type", # Name for the melted column names value_name="price"# Name for the melted values)# Display the long format DataFramecosts_long```### Spread data wide(long to wide)```{python}# Pivot the costs_long DataFrame from long to wide formatcosts_wide = costs_long.pivot( values="price", index="id", on="type")# Display the wide format DataFramecosts_wide```## string```{python}# Create a Polars DataFrame named dfdf = pl.DataFrame({"text": ["abc", "DDD", "1243c", "aeEe"],"num": [3, 4, 7, 8]})# Display the DataFramedf```### upper case```{python}# Add a new column 'text_new' with the uppercase version of the 'text' columndf.with_columns( 
pl.col("text").str.to_uppercase().alias("text_new"))```### lower case```{python}# Add a new column 'text_new' with the lowercase version of the 'text' columndf.with_columns( pl.col("text").str.to_lowercase().alias("text_new"))```### match```{python}# Add multiple new columns based on string matching conditionsdf.with_columns([# 'text_new1': 'T' if 'text' is exactly 'abc', else 'F' pl.when(pl.col("text") =="abc").then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new1"),# 'text_new2': 'T' if 'text' starts with 'a', else 'F' pl.when(pl.col("text").str.starts_with("a")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new2"),# 'text_new3': 'T' if 'text' ends with 'c', else 'F' pl.when(pl.col("text").str.ends_with("c")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new3"),# 'text_new4': 'T' if 'text' contains '4', else 'F' pl.when(pl.col("text").str.contains("4")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new4"),])```### concatenation```{python}# Add a new column 'text_new1' by concatenating the 'text' column with itself, separated by " is "df.with_columns( (pl.col("text") +" is "+ pl.col("text")).alias("text_new1"))```### replaceUse .str.replace(..., regex=True) with regular expressions to replace patterns in strings.For example, the code below uses "a.", where . 
is called a wildcard–which matches any character.

```{python}
# Add a new column 'text_new1' by replacing a regex pattern in the 'text' column
df.with_columns(
    pl.col("text").str.replace(r"a.", "XX").alias("text_new1")
)
```

### extract

Use str.extract() with a regular expression to pull out a matching piece of text.

text_new1 gets rid of the first 'a'; text_new2 gets rid of the last 'c'.

```{python}
# Add new columns by extracting substrings from the 'text' column using regular expressions
df.with_columns([
    # 'text_new1': extract everything after the first 'a'
    pl.col("text").str.extract(r"a(.*)", 1).alias("text_new1"),
    # 'text_new2': extract everything before the last 'c'
    pl.col("text").str.extract(r"(.*)c", 1).alias("text_new2")
])
```

## date

Polars provides robust functionality for handling date and time data, including parsing, extracting components, and formatting.

```{python}
# Create a Polars DataFrame with 'dates' and 'times' columns containing
# string representations of dates and datetimes
df_dates = pl.DataFrame({
    "dates": ["2023-04-05", "2024-05-06"],
    "times": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"]
})
# Display the DataFrame
df_dates
```

### Extracting Date Components

You can extract various components like year, month, day, hour, minute, second from datetime objects.

```python
# Convert 'dates' to date type and extract year, month, day
df_dates.with_columns([
    pl.col("dates").str.to_date().dt.year().alias("year"),
    pl.col("dates").str.to_date().dt.month().alias("month"),
    pl.col("dates").str.to_date().dt.day().alias("day"),
])
```

### Formatting Dates

Dates can be formatted into different string representations using `strftime()`.

```python
# Convert 'times' to datetime type and format it as YYYY-MM-DD HH:MM:SS
df_dates.with_columns(
    pl.col("times").str.to_datetime().dt.strftime("%Y-%m-%d %H:%M:%S").alias("formatted_time")
)
```

```{python}
# Add new columns by converting string columns to datetime and date objects
df_dates2 = df_dates.with_columns(
    # Convert 'times' column to datetime
objects and alias as 'real_times' pl.col("times").str.to_datetime().alias("real_times"),# Convert 'dates' column to date objects and alias as 'real_dates' pl.col("dates").str.to_date().alias("real_dates"))# Display the DataFrame with new columnsdf_dates2``````{python}# Generate descriptive statistics for the df_dates2 DataFramedf_dates2.describe()```## using polars with database### set up a DuckDB database, with an mtcars table.```{python}# Connect to DuckDB (in-memory by default)con = duckdb.connect()# Sample mtcars data as a list of tuplesmtcars_data = [ (21.0, 6, 160, 110, 3.90, 2.62, 16.46, 0, 1, 4, 4), (21.0, 6, 160, 110, 3.90, 2.875, 17.02, 0, 1, 4, 4), (22.8, 4, 108, 93, 3.85, 2.320, 18.61, 1, 1, 4, 1), (21.4, 6, 258, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1),]# Define column names for the DataFramecolumns = ["mpg", "cyl", "disp", "hp", "drat", "wt", "qsec","vs", "am", "gear", "carb"]# Create Polars DataFrame from the sample data and column namesdf = pl.DataFrame(mtcars_data, schema=columns)```### create table in duckdb```{python}# Register the Polars DataFrame 'df' as a table named 'mtcars' in the DuckDB connectioncon.register("mtcars", df)```### show tables in duckdb```{python}# Execute an SQL query to show all tables and fetch the resultscon.execute("SHOW TABLES").fetchall()```### create query```{python}# Define an SQL query to select 'mpg', 'cyl', and 'hp' from the 'mtcars' table where 'cyl' is 6query ="SELECT mpg, cyl, hp FROM mtcars WHERE cyl = 6"```### get data```{python}# Execute the SQL query and retrieve the result as a Polars DataFrameresult = con.execute(query).pl()# Display the resultresult```### LazyFrame```{python}# Execute the SQL query and retrieve the result as a Polars LazyFramelazy_df = con.execute(query).pl().lazy()# Display the LazyFramelazy_df```### Collect to DataFramebecause lazy expressions,the collect function is actually running the sql.```{python}# Collect the LazyFrame, executing the query and returning a Polars DataFrameresult_df = 
lazy_df.collect()# Display the resulting DataFrameresult_df```## reference:https://docs.pola.rs/api/python/stable/reference/index.html